MODIS, SeaWiFS, VIIRS-SNPP, OLCI-A, OLCI-B

Code

library(dplyr)
library(tidyr)
library(oceancolouR)
library(ggplot2)
library(mapdata)
library(patchwork)
library(DT)
library(grid)
library(scales) # for breaks_pretty in boxplot axis labels


# restrict matchups to these years (-Inf, Inf for no restrictions)
years <- c(-Inf,Inf)

# region bounds
lat_bounds <- c(39,63)
lon_bounds <- c(-71,-42)

max_dist <- 10000 # metres

max_depth <- 10 # in situ depth, metres

# maximum number of matchup pixels for each in situ record (within max_dist from in situ data)
max_matchups <- 100 # from the input file
max_matchups_new <- 25 # further restrictions to the output

# use the median of the "max_matchup_new" closest points? (or use the single closest point?)
use_median <- TRUE

olci_flags <- c("INVALID", "LAND", "TURBID", "ICE", "CLOUD1", "CLOUD2")

input_path <- "02_matchups/"


#*******************************************************************************
# READ AND FORMAT DATA

# use this to collect data from all sensors/variables to plot to the map later, color-coded by sensor
full_df <- data.frame(matrix(nrow=0, ncol=5), stringsAsFactors = FALSE)
colnames(full_df) <- c("Sensor", "Year", "DOY", "Latitude", "Longitude")
full_df$Sensor <- as.character(full_df$Sensor)
full_df[,2:3] <- lapply(full_df[,2:3], as.integer)
full_df[,4:5] <- lapply(full_df[,4:5], as.numeric)

data_list <- list()
plot_lm_list <- list()
plot_box_list <- list()
list_ind <- 1

num_cols <- length(unlist(all_variables[names(all_variables) %in% sensors]))
stat_df <- data.frame(matrix(nrow=length(sensors), ncol=10), stringsAsFactors = FALSE)
colnames(stat_df) <- c("Sensor", "Variable", "Flagged", "Years", "Intercept", "Slope", "R^2", "RMSE", "pvalue", "num_obs")


# loop through each sensor, and each of the variables for the sensor, and calculate statistics, fits, and create maps and plots
for (i in 1:length(sensors)) {
    
    sensor <- sensors[i]
    
    variables <- all_variables[[sensor]]
    
    for (j in 1:length(variables)) {
        
        variable <- variables[j]
        
        input_fname <- gsub(" flagged", "", paste0("matchups_L3b_daily_", sensor, "_", variable, "_maxdist_", max_dist, "_maxdepth_", max_depth, "_maxmatchups_", max_matchups, ".csv"))
        
        if (!file.exists(paste0(input_path, input_fname))) {next}
        
        flagged <- grepl("flagged", sensor)
        
        # this must be different because CHL-OC5 is written in the table as CHL.OC5
        df_sat_var <- ifelse(variable=="CHL-OC5", "CHL.OC5", variable)
        
        df <- read.csv(paste0(input_path, input_fname)) %>%
            dplyr::mutate_if(is.numeric, round, digits = 3)
        
        # assign the variable names to a generic name for easier use below
        df$sat_var <- df[, paste0("sat_", df_sat_var)]
        df$in_situ_var <- df[, "HPLC_CHLA"]
        
        df <- df %>%
            
            # remove bad data
            dplyr::filter(is.finite(sat_var) & sat_var > 0 & is.finite(in_situ_var) & in_situ_var > 0) %>%
            
            # restrict to a certain number of matchups for each day/pixel
            dplyr::group_by(YEAR, DOY, LATDEC, LONGDEC, DEPTH) %>%
            dplyr::slice_min(order_by = dist_to_matchup, n = max_matchups_new) %>%
            dplyr::ungroup() %>%
            
            # retrieve a single matchup for comparison to in situ data (either single closest, or median of closest pixels)
            dplyr::arrange(., cruise_id, ID, DEPTH, LATDEC, LONGDEC, YEAR, DOY, HPLC_CHLA, HPLC_FUCOX, dist_to_matchup) %>%
            dplyr::group_by(., cruise_id, ID, DEPTH, LATDEC, LONGDEC, YEAR, DOY, HPLC_CHLA, HPLC_FUCOX)
        
        if (use_median) {
          # use the median of all closest matchups
          df <- df %>%
            dplyr::summarize_all(., .funs = median, na.rm=TRUE) %>%
            dplyr::select(-pancan_lon, -pancan_lat)
        } else {
          # OR use the closest matchup
          df <- df %>%
            dplyr::summarize_all(., .funs = dplyr::first)
        }
        
        df <- df %>%
            dplyr::ungroup() %>%
            dplyr::arrange(., YEAR, DOY)
        
        
        # restrict to certain years
        if (all(is.finite(years))) {
          df <- df %>% dplyr::filter(between(YEAR, years[1], years[2]))
          tmp_years <- c(max(min(df$YEAR, na.rm=TRUE),years[1]), min(max(df$YEAR, na.rm=TRUE), years[2]))
        } else if (is.finite(years[1])) {
          df <- df %>% dplyr::filter(YEAR >= years[1])
          tmp_years <- c(max(min(df$YEAR, na.rm=TRUE),years[1]), max(df$YEAR, na.rm=TRUE))
        } else if (is.finite(years[2])) {
          df <- df %>% dplyr::filter(YEAR <= years[2])
          tmp_years <- c(min(df$YEAR, na.rm=TRUE), min(max(df$YEAR, na.rm=TRUE), years[2]))
        } else {
          tmp_years <- range(df$YEAR, na.rm=TRUE)
        }
        
        
        # restrict lat/lon
        df <- df %>%
          dplyr::filter(between(LATDEC,lat_bounds[1],lat_bounds[2]), between(LONGDEC,lon_bounds[1],lon_bounds[2]))
        
        # collect data for mapping later
        full_df <- dplyr::bind_rows(full_df,
                                    df %>%
                                      dplyr::mutate(Sensor=sensor) %>%
                                      dplyr::rename(Year=YEAR, Latitude=LATDEC, Longitude=LONGDEC) %>%
                                      dplyr::select(Sensor, Year, DOY, Latitude, Longitude))
        
        # FOR OLCI, RESTRICT BY FLAG VALUES
        # All rows should be kept for printing and plotting, but flagged rows should not be used in the stats, linear regression, etc., and flagged values should be colored differently
        if (startsWith(sensor, "OLCI") & flagged & length(olci_flags) > 0) {
            df <- df %>%
              dplyr::mutate(mask = rowSums(.[names(.) %in% paste0("MASK_", olci_flags)], na.rm=TRUE)) %>%
              dplyr::mutate(mask = ifelse(mask > 0, 1, 0))
              tmp_df_stats <- df %>% dplyr::filter(mask != 1)
        } else {
            df$mask <- rep(0, nrow(df))
            tmp_df_stats <- df
        }
        
        # adjust the mask column - for non-flagged values, include info on season
        df[df$mask==0 & df$DOY <= 181, "mask"] <- 2
        df[df$mask==0 & df$DOY > 181, "mask"] <- 3
        df$mask <- factor(df$mask)
        
        
        #***********************************************************************
        # CALCULATE STATS
        
        errors <- vector_errors(as.numeric(unlist(df[,"HPLC_CHLA"])),
                            as.numeric(unlist(df[,paste0("sat_", df_sat_var)])))
        df <- dplyr::bind_cols(df, round(errors,3))
        
        mdf_lm <- lm(log10(sat_var) ~ log10(in_situ_var), data=tmp_df_stats)
        
        stat_df[list_ind,] <- c(sensor, variable, flagged,
                                paste0(tmp_years, collapse="-"),
                                round(as.numeric(coef(mdf_lm)[1]),3),
                                round(as.numeric(coef(mdf_lm)[2]),3),
                                round(summary(mdf_lm)$r.squared,3),
                                round(rmse(log10(tmp_df_stats$sat_var), log10(tmp_df_stats$in_situ_var)),3),
                                formatC(lmp(mdf_lm), format="e", digits = 3),
                                nobs(mdf_lm))
        
        
        #***********************************************************************
        # CREATE LINEAR MODEL PLOT
        
        p_lm <- ggplot(df) +
              geom_point(aes(x=in_situ_var, y=sat_var, color=mask), size=1.5, alpha=0.7) +
              # invisible dummy layer that will force the full range of colors for the legend, even if the points aren't on this graph
              geom_point(data=data.frame(sat_var=1:3, in_situ_var=1:3, mask=as.factor(1:3), stringsAsFactors = FALSE), aes(x=in_situ_var, y=sat_var, color=mask), alpha = 0) +
              # 1:1 line
              geom_abline(slope=1, intercept=0) +
              # linear model
              geom_abline(slope=coef(mdf_lm)[2], intercept=coef(mdf_lm)[1], colour="red") +
              # log scales
              scale_x_continuous(limits=c(0.1,10), trans="log10") +
              scale_y_continuous(limits=c(0.1,10), trans="log10") +
              labs(x=paste0("In situ HPLC_CHLA"),
                   y=variable) +
              theme_bw() +
              scale_colour_manual(breaks = 1:3, values = c("black", "darkgreen", "red"), labels = c("Flagged", "Spring", "Fall")) +
              theme(legend.position="right",
                    legend.title=element_blank(),
                    legend.text=element_text(size = 16)) +
              annotation_custom(grobTree(textGrob(sensor, x=0.05,  y=0.95, hjust=0, gp=gpar(fontsize=12)))) +
              # adjust size of points in legend
              guides(colour=guide_legend(override.aes=list(size=5)))
        
        plot_lm_list[[list_ind]] <- p_lm
        
        
        #***********************************************************************
        # CREATE ERROR BOXPLOTS
        
        p_box <- ggplot(df[df$mask != 1,], aes(x=factor(YEAR), y=percent_error, fill=factor(mask))) +
              geom_boxplot(outlier.shape=21, outlier.size=2) +
              # invisible dummy layer that will force the full range of colors for the legend, even if the points aren't on this graph
              geom_point(data=data.frame(YEAR=tmp_years[2], percent_error=rep(0,2), mask=as.factor(2:3), stringsAsFactors = FALSE), aes(x=factor(YEAR), y=percent_error, fill=factor(mask)), alpha = 0) +
              scale_y_continuous(limits=c(-200,2000), breaks=c(-100,-10,-1,1,10,100,1000), labels=c(-100,-10,-1,1,10,100,1000), trans="asinh") +
              scale_x_discrete(breaks=breaks_pretty(n=round(diff(range(df$YEAR, na.rm=TRUE))/2))) +
              scale_fill_manual(values=c("2"="#33EE55", "3"="#FF3333"), labels=c("Spring", "Fall")) +
              geom_hline(yintercept=mean(unlist(df[df$mask==2, "percent_error"]), na.rm=TRUE),color='#11CC33',alpha=0.7) +
              geom_hline(yintercept=mean(unlist(df[df$mask==3, "percent_error"]), na.rm=TRUE),color='#FF3333',alpha=0.7) +
              theme_bw() +
              labs(fill="Season", y=expression(paste('[',italic('Chla'),'] % error '))) +
              theme(legend.position="right",
                    legend.title=element_text(size = 20),
                    legend.text=element_text(size = 16),
                    legend.key.size = unit(3,"line"),
                    axis.title.x=element_blank(),
                    axis.text.x=element_text(size=10,angle=45,colour='black',vjust=1, hjust=1),
                    axis.title.y=element_text(size=12,colour="black")) +
              annotation_custom(grobTree(textGrob(sensor, x=0.05,  y=0.95, hjust=0, gp=gpar(fontsize=12))))
        
        plot_box_list[[list_ind]] <- p_box
        
        list_ind <- list_ind + 1
        
    }

}

Statistics table

Below are the statistics from each set of in situ / satellite matchups, restricted to satellite matchups within 10000 metres of the in situ data sampling location, and a depth of 10 metres.

Satellite data and distance to matchup are the median values of the 25 closest binned pixels to the in situ sampling location, within 10000 metres.

Linear model plots

Flags used for OLCI: INVALID, LAND, TURBID, ICE, CLOUD1, CLOUD2
“Fall” points are defined as anything after day of year 181, and “Spring” points before that. Flagged points are in black, and have not been used in the model.

Error boxplots

Percent error for each sensor and variable are plotted below. Green and red boxes represent spring and fall error respectively, and green and red lines represent the mean value of each.

Map

Matchups are mapped below, colour-coded by sensor. Note that many matchups for different sensors might overlap.